{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "expression = \"\"\"\n",
    "        (^(?!(?:[0-9]{1,3}\\.){3}[0-9]{1,3}$).*$)|  # will match non valid ipV4\n",
    "        (^127\\.0\\.0\\.1)|  # will match 127.0.0.1\n",
    "        (^10\\.)|  # will match 10.0.0.0 - 10.255.255.255 IP-s\n",
    "        (^172\\.1[6-9]\\.)|  # will match 172.16.0.0 - 172.19.255.255 IP-s\n",
    "        (^172\\.2[0-9]\\.)|  # will match 172.20.0.0 - 172.29.255.255 IP-s\n",
    "        (^172\\.3[0-1]\\.)|  # will match 172.30.0.0 - 172.31.255.255 IP-s\n",
    "        (^192\\.168\\.)  # will match 192.168.0.0 - 192.168.255.255 IP-s\n",
    "    \"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "regex=re.compile(regex,'')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "regex=re.compile('\\s\\s')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "text='''\n",
    "19980101-01-001-001/m  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  ——/w  一九九八年/t  新年/t  讲话/n  （/w  附/v  图片/n  １/m  张/q  ）/w\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "19980101-01-001-001/m\n",
      "迈向/v\n",
      "充满/v\n",
      "希望/n\n",
      "的/u\n",
      "新/a\n",
      "世纪/n\n",
      "——/w\n",
      "一九九八年/t\n",
      "新年/t\n",
      "讲话/n\n",
      "（/w\n",
      "附/v\n",
      "图片/n\n",
      "１/m\n",
      "张/q\n",
      "）/w\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(re.sub(regex,'\\n',text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " 驱动器 D 中的卷没有标签。\n",
      " 卷的序列号是 2E66-0EF2\n",
      "\n",
      " D:\\github\\A_DJH\\DJH-Python\\libs\\re 的目录\n",
      "\n",
      "2018/03/14  15:59    <DIR>          .\n",
      "2018/03/14  15:59    <DIR>          ..\n",
      "2018/03/14  14:07    <DIR>          .ipynb_checkpoints\n",
      "2018/03/14  14:01            57,467 testall.txt\n",
      "2018/03/14  15:59             2,831 Untitled.ipynb\n",
      "               2 个文件         60,298 字节\n",
      "               3 个目录 87,200,976,896 可用字节\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import codecs\n",
    "text = []\n",
    "with codecs.open('./testall.txt', 'rb', encoding='utf-8') as f:\n",
    "    text.extend(f.readlines())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19980101-01-001-003/m\n",
      "（/w\n",
      "一九九七年/t\n",
      "十二月/t\n",
      "三十一日/t\n",
      "）/w\n"
     ]
    }
   ],
   "source": [
    "regex=re.compile('\\d{2,4}年/t|[零一二三四五六七八九十]{2,4}年/t|\\d{1,2}月/t|[一二三四五六七八九十]{1,2}月/t|\\d{1,3}日/t|[一二三四五六七八九十]{1,3}日/t|\\d{1,2}时/t|[零一二三四五六七八九十]{1,3}时/t|\\d{1,3}分/t|[零一二三四五六七八九十]{1,3}分/t|\\d{1,3}秒/t|[零一二三四五六七八九十]{1,3}秒/t|[零一二三四五六七八九十]{1,3}点/t')\n",
    "sent='19980101-01-001-003/m  （/w  一九九七年/t  十二月/t  三十一日/t  ）/w'\n",
    "tokens=sent.split('  ')\n",
    "for t in tokens:\n",
    "    print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 4 ['一九九七年/t', '十二月/t', '三十一日/t']\n"
     ]
    }
   ],
   "source": [
    "groups=[]\n",
    "first_index=-1\n",
    "last_index=-1\n",
    "i=0\n",
    "j=0\n",
    "while i<len(tokens):\n",
    "    t=tokens[i]\n",
    "    if regex.match(t):\n",
    "        if first_index==-1:#找到块的第一个时间\n",
    "            first_index=i\n",
    "            j=0\n",
    "            while j<len(tokens[i+1:]):\n",
    "                tt=tokens[i+1+j]\n",
    "                #if regex.match(tt) or ''==tt.strip():#解决相邻时间有多个空格的时候无法将相邻时间分在一个块的问题，其实是语料本身的问题\n",
    "                if regex.match(tt):\n",
    "                    last_index=i+1+j#找到块的最后一个时间\n",
    "                    j+=1\n",
    "                else:\n",
    "                    last_index=i+j#找到块的最后一个时间\n",
    "                    break\n",
    "            print(first_index,last_index,tokens[first_index:last_index+1])\n",
    "            groups.append((first_index,last_index))\n",
    "            if last_index==-1:\n",
    "                break\n",
    "            i=last_index\n",
    "            first_index=-1\n",
    "            last_index=-1\n",
    "    i+=1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['一九九七年十二月三十一日/t']\n"
     ]
    }
   ],
   "source": [
    "merges=[]\n",
    "for i,g in enumerate(groups):\n",
    "    a,b=g\n",
    "    time_tokens=tokens[a:b+1]\n",
    "    merge = ''\n",
    "    for t in time_tokens:\n",
    "        merge += t[:-2]\n",
    "    merge += '/t'\n",
    "    merges.append(merge)\n",
    "print(merges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 491,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "merges.reverse()\n",
    "for i,g in enumerate(groups[::-1]):\n",
    "    a,b=g\n",
    "    tokens[a:b]=[merges[i]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 492,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['希望/n',\n",
       " '的/u',\n",
       " '新/a',\n",
       " '世纪/n',\n",
       " '１９９８年12月12日12时/t',\n",
       " '希望/n',\n",
       " '的/u',\n",
       " '新/a',\n",
       " '1998年12月/t',\n",
       " '。/w']"
      ]
     },
     "execution_count": 492,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 472,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['希望/n',\n",
       " '的/u',\n",
       " '新/a',\n",
       " '世纪１９９８年12月12日12时/t',\n",
       " '希望/n',\n",
       " '的/u',\n",
       " '新1998年12月/t',\n",
       " '。/w']"
      ]
     },
     "execution_count": 472,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'１９９８年12月12日12时/t'"
      ]
     },
     "execution_count": 235,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 493,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "s='123/asd'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 494,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'123 asd'"
      ]
     },
     "execution_count": 494,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.replace('/',' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import sys\n",
    "import codecs\n",
    "import datetime\n",
    "\n",
    "def load_files(files):\n",
    "\t'''\n",
    "\t:param files:文件列表\n",
    "\t:return:文件内容\n",
    "\t'''\n",
    "\ttext = []\n",
    "\tprint(files)\n",
    "\tfor file in files:\n",
    "\t\tif file:\n",
    "\t\t\twith codecs.open(file, 'rb', encoding='utf-8') as f:\n",
    "\t\t\t\ttext.extend(f.readlines())\n",
    "\treturn text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['./testall.txt']\n"
     ]
    }
   ],
   "source": [
    "text = load_files(['./testall.txt'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['19980101-01-001-001/m  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  ——/w  一九九八年/t  新年/t  讲话/n  （/w  附/v  图片/n  １/m  张/q  ）/w\\n',\n",
       " '19980101-01-001-002/m  中共中央/nt  总书记/n  、/w  国家/n  主席/n  江/nr  泽民/nr\\n',\n",
       " '19980101-01-001-003/m  （/w  一九九七年/t  十二月/t  三十一日/t  ）/w\\n',\n",
       " '19980101-01-001-004/m  １２月/t  ３１日/t  ，/w  中共中央/nt  总书记/n  、/w  国家/n  主席/n  江/nr  泽民/nr  发表/v  １９９８年/t  新年/t  讲话/n  《/w  迈向/v  充满/v  希望/n  的/u  新/a  世纪/n  》/w  。/w  （/w  新华社/nt  记者/n  兰/nr  红光/nr  摄/Vg  ）/w\\n',\n",
       " '19980101-01-001-005/m  同胞/n  们/k  、/w  朋友/n  们/k  、/w  女士/n  们/k  、/w  先生/n  们/k  ：/w\\n',\n",
       " '19980101-01-001-006/m  在/p  １９９８年/t  来临/v  之际/f  ，/w  我/r  十分/m  高兴/a  地/u  通过/p  [中央/n  人民/n  广播/vn  电台/n]nt  、/w  [中国/ns  国际/n  广播/vn  电台/n]nt  和/c  [中央/n  电视台/n]nt  ，/w  向/p  全国/n  各族/r  人民/n  ，/w  向/p  [香港/ns  特别/a  行政区/n]ns  同胞/n  、/w  澳门/ns  和/c  台湾/ns  同胞/n  、/w  海外/s  侨胞/n  ，/w  向/p  世界/n  各国/r  的/u  朋友/n  们/k  ，/w  致以/v  诚挚/a  的/u  问候/vn  和/c  良好/a  的/u  祝愿/vn  ！/w']"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
